# Quick structural overview: dtypes, non-null counts and memory footprint.
df.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 21178 entries, 1958-01-04 to 2015-12-31 Data columns (total 19 columns): tas_0 21178 non-null float64 tas_1 21178 non-null float64 tas_2 21178 non-null float64 tas_3 21178 non-null float64 tas_4 21178 non-null float64 tas_5 21178 non-null float64 tas_6 21178 non-null float64 tas_7 21178 non-null float64 tas_8 21178 non-null float64 pr_0 21178 non-null float64 pr_1 21178 non-null float64 pr_2 21178 non-null float64 pr_3 21178 non-null float64 pr_4 21178 non-null float64 pr_5 21178 non-null float64 pr_6 21178 non-null float64 pr_7 21178 non-null float64 pr_8 21178 non-null float64 flow 21178 non-null float64 dtypes: float64(19) memory usage: 3.2 MB
# Summary statistics for every column (9 tas, 9 pr, flow).
df.describe()
| tas_0 | tas_1 | tas_2 | tas_3 | tas_4 | tas_5 | tas_6 | tas_7 | tas_8 | pr_0 | pr_1 | pr_2 | pr_3 | pr_4 | pr_5 | pr_6 | pr_7 | pr_8 | flow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 | 21178.000000 |
| mean | -0.047006 | 0.714643 | 2.979516 | 1.489362 | 1.274884 | 2.313594 | 1.890589 | 2.839834 | 3.527821 | 1.991529 | 2.007583 | 2.076490 | 2.213070 | 2.265171 | 2.214652 | 2.305860 | 2.385376 | 2.255959 | 6.914146 |
| std | 8.261391 | 8.396938 | 9.050337 | 8.571648 | 8.467370 | 8.613825 | 8.575300 | 8.797301 | 8.959525 | 4.025939 | 4.347522 | 4.496716 | 4.560043 | 4.756833 | 4.908967 | 4.807279 | 5.424111 | 5.184413 | 9.665353 |
| min | -33.799999 | -32.599998 | -33.299999 | -33.700001 | -32.799999 | -32.200001 | -33.200001 | -32.599998 | -30.799999 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000031 |
| 25% | -5.800000 | -5.200000 | -3.100000 | -4.400000 | -4.600000 | -3.700000 | -4.000000 | -3.200000 | -2.600000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.156123 |
| 50% | 0.200000 | 0.900000 | 3.400000 | 1.800000 | 1.500000 | 2.500000 | 2.200000 | 3.100000 | 3.700000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.151342 |
| 75% | 6.600000 | 7.600000 | 10.600000 | 8.500000 | 8.200000 | 9.500000 | 9.000000 | 10.200000 | 11.100000 | 2.500000 | 2.200000 | 2.200000 | 2.600000 | 2.600000 | 2.200000 | 2.600000 | 2.200000 | 2.000000 | 8.606681 |
| max | 20.299999 | 21.200001 | 23.400000 | 21.700001 | 21.700001 | 22.700001 | 21.799999 | 23.299999 | 24.000000 | 61.500000 | 60.700001 | 66.800003 | 64.000000 | 62.400002 | 61.299999 | 64.500000 | 67.699997 | 73.699997 | 85.397653 |
# Exploratory questions for the temperature series:
# - timeseries normalized?
# - strong annual cycle embedded >> need to remove before any fitting
# - any trends?
tas_df.plot(subplots=True, layout=(9, 1), figsize=(8, 25), sharex=True) # certainly not normalized
sns.despine()
plt.show()
# Inspect the extreme cold days (tas_0 below -30 degC).
tas_df[tas_df['tas_0']<-30]
# The cold days seem quite universal across locations, for example, 1987-01-10 to 1987-01-11, and 1978-12-31 to 1979-01-01
| tas_0 | tas_1 | tas_2 | tas_3 | tas_4 | tas_5 | tas_6 | tas_7 | tas_8 | |
|---|---|---|---|---|---|---|---|---|---|
| 1978-12-31 | -31.799999 | -30.600000 | -25.900000 | -29.600000 | -29.500000 | -27.200001 | -28.500000 | -25.100000 | -23.600000 |
| 1979-01-01 | -33.799999 | -32.299999 | -26.799999 | -31.500000 | -31.200001 | -28.600000 | -30.600000 | -26.700001 | -25.299999 |
| 1982-01-08 | -32.500000 | -31.000000 | -27.299999 | -29.700001 | -29.900000 | -27.600000 | -29.000000 | -27.900000 | -26.700001 |
| 1987-01-10 | -30.299999 | -29.500000 | -26.900000 | -28.200001 | -28.200001 | -27.700001 | -27.500000 | -26.700001 | -26.000000 |
| 1987-01-11 | -32.099998 | -31.900000 | -33.299999 | -33.700001 | -32.799999 | -32.200001 | -33.200001 | -32.599998 | -30.799999 |
| 2010-01-09 | -33.799999 | -32.599998 | -23.600000 | -31.200001 | -32.500000 | -26.799999 | -29.700001 | -25.299999 | -23.299999 |
# Box plot of the per-location temperature spreads.
# Alternative labeling / seaborn variant kept below for reference:
#label=['Loc1','Loc2','Loc3','Loc4','Loc5','Loc6','Loc7','Loc8','Loc9']
ax=tas_df.plot(kind='box')
#ax.set_xticklabels(label,ha='center')
#ax = sns.boxplot(data=tas_df)
plt.title('Air_temperature spreads')
plt.ylabel('degC')
plt.show()
# Raw (not deseasonalized) cross-location temperature correlations.
tas_df.corr() ## very high correlation because they share a similar annual cycle,
## The air_temperatures between locations appear to correlate pretty well.
| tas_0 | tas_1 | tas_2 | tas_3 | tas_4 | tas_5 | tas_6 | tas_7 | tas_8 | |
|---|---|---|---|---|---|---|---|---|---|
| tas_0 | 1.000000 | 0.998961 | 0.977102 | 0.996461 | 0.997470 | 0.990438 | 0.994347 | 0.982094 | 0.971060 |
| tas_1 | 0.998961 | 1.000000 | 0.982934 | 0.998647 | 0.999481 | 0.994893 | 0.997278 | 0.987590 | 0.977901 |
| tas_2 | 0.977102 | 0.982934 | 1.000000 | 0.988455 | 0.986177 | 0.993120 | 0.991115 | 0.998222 | 0.996420 |
| tas_3 | 0.996461 | 0.998647 | 0.988455 | 1.000000 | 0.999552 | 0.996875 | 0.999490 | 0.991545 | 0.982327 |
| tas_4 | 0.997470 | 0.999481 | 0.986177 | 0.999552 | 1.000000 | 0.996769 | 0.998841 | 0.990496 | 0.981370 |
| tas_5 | 0.990438 | 0.994893 | 0.993120 | 0.996875 | 0.996769 | 1.000000 | 0.998108 | 0.996892 | 0.991014 |
| tas_6 | 0.994347 | 0.997278 | 0.991115 | 0.999490 | 0.998841 | 0.998108 | 1.000000 | 0.994139 | 0.985866 |
| tas_7 | 0.982094 | 0.987590 | 0.998222 | 0.991545 | 0.990496 | 0.996892 | 0.994139 | 1.000000 | 0.996982 |
| tas_8 | 0.971060 | 0.977901 | 0.996420 | 0.982327 | 0.981370 | 0.991014 | 0.985866 | 0.996982 | 1.000000 |
# Pairwise scatter of raw temperatures -- dominated by the shared annual cycle.
sns.pairplot(tas_df)
<seaborn.axisgrid.PairGrid at 0x12903d470>
# Remove seasonal cycle then estimate correlation
# Monthly climatology: mean temperature per calendar month (1-12) at each location.
climatology=tas_df.groupby(tas_df.index.month).mean()
climatology.plot() ## seasonal cycle
<matplotlib.axes._subplots.AxesSubplot at 0x113683860>
# Deseasonalize: subtract each calendar month's climatological mean from
# every observation in that month.
def _remove_group_mean(series):
    """Return the series minus its group mean (monthly anomaly)."""
    return series - series.mean()

deseasoned_tas = tas_df.groupby(tas_df.index.month).transform(_remove_group_mean)
# Raw temperature at location 0 over 1960-1965, for comparison with the anomalies.
tas_df['tas_0']['1960':'1965'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x115febe10>
# Same 1960-1965 window after subtracting the monthly means -- annual cycle removed.
deseasoned_tas['tas_0']['1960':'1965'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1161f8390>
# Pairwise scatter of temperature anomalies (seasonal cycle removed).
sns.pairplot(deseasoned_tas)
<seaborn.axisgrid.PairGrid at 0x1161745c0>
# Cross-location correlations after deseasonalizing -- still high, but
# noticeably lower than the raw correlations above.
deseasoned_tas.corr()
| tas_0 | tas_1 | tas_2 | tas_3 | tas_4 | tas_5 | tas_6 | tas_7 | tas_8 | |
|---|---|---|---|---|---|---|---|---|---|
| tas_0 | 1.000000 | 0.996964 | 0.924950 | 0.988855 | 0.992109 | 0.968863 | 0.981680 | 0.939819 | 0.899106 |
| tas_1 | 0.996964 | 1.000000 | 0.940546 | 0.995158 | 0.998128 | 0.982121 | 0.990185 | 0.955442 | 0.918523 |
| tas_2 | 0.924950 | 0.940546 | 1.000000 | 0.959395 | 0.951193 | 0.973911 | 0.968452 | 0.993253 | 0.985519 |
| tas_3 | 0.988855 | 0.995158 | 0.959395 | 1.000000 | 0.998353 | 0.988802 | 0.998118 | 0.969132 | 0.933780 |
| tas_4 | 0.992109 | 0.998128 | 0.951193 | 0.998353 | 1.000000 | 0.988546 | 0.995727 | 0.965370 | 0.930298 |
| tas_5 | 0.968863 | 0.982121 | 0.973911 | 0.988802 | 0.988546 | 1.000000 | 0.993044 | 0.987592 | 0.963742 |
| tas_6 | 0.981680 | 0.990185 | 0.968452 | 0.998118 | 0.995727 | 0.993044 | 1.000000 | 0.978340 | 0.946314 |
| tas_7 | 0.939819 | 0.955442 | 0.993253 | 0.969132 | 0.965370 | 0.987592 | 0.978340 | 1.000000 | 0.987610 |
| tas_8 | 0.899106 | 0.918523 | 0.985519 | 0.933780 | 0.930298 | 0.963742 | 0.946314 | 0.987610 | 1.000000 |
# Per-column linear trend, scaled by 365*10.
# NOTE(review): linTrend is defined elsewhere; presumably it returns the
# slope per day, making this a per-decade trend -- confirm against its definition.
deseasoned_df.apply(linTrend)*365*10
tas_0 0.230259 tas_1 0.245758 tas_2 0.348280 tas_3 0.250211 tas_4 0.252066 tas_5 0.296777 tas_6 0.263369 tas_7 0.340650 tas_8 0.419416 pr_0 0.043812 pr_1 0.043798 pr_2 0.057806 pr_3 0.062067 pr_4 0.055505 pr_5 0.033106 pr_6 0.044956 pr_7 0.034719 pr_8 0.029534 flow 0.152001 dtype: float64
# Per-location precipitation timeseries.
pr_df.plot(subplots=True, layout=(9, 1), figsize=(8, 25), sharex=True) # certainly not normalized
sns.despine()
plt.show()
# Remove seasonal cycle then estimate correlation
# Monthly climatology of precipitation at each location.
climatology_pr=pr_df.groupby(pr_df.index.month).mean()
climatology_pr.plot() ## seasonal cycle
# Most precipitation occurs in Summer, peaks in July, August, but remain high till November
# Lowest precipitation in February till April
<matplotlib.axes._subplots.AxesSubplot at 0x11cd54eb8>
# Deseasonalize precipitation: subtract each calendar month's mean in a single pass.
deseasoned_pr = pr_df.groupby(pr_df.index.month).transform(lambda s: s - s.mean())
# Pairwise scatter of the precipitation anomalies.
sns.pairplot(deseasoned_pr)
<seaborn.axisgrid.PairGrid at 0x1185b9c18>
# Decadal linear trend of the deseasonalized precipitation at each location.
deseasoned_pr.apply(linTrend)*365*10 #also a positive trends in precipitation in all locations
pr_0 0.041264 pr_1 0.042278 pr_2 0.057685 pr_3 0.059515 pr_4 0.052664 pr_5 0.031307 pr_6 0.044357 pr_7 0.034299 pr_8 0.026846 dtype: float64
# Pairwise scatter of all deseasonalized variables (tas, pr, flow).
# NOTE(review): deseasoned_df is only created further down in this file --
# the notebook cells were evidently run out of order; fails on a fresh top-to-bottom run.
sns.pairplot(deseasoned_df)
<seaborn.axisgrid.PairGrid at 0x122246ba8>
# Correlate deseasonalized precipitation with deseasonalized temperature,
# pairing columns by position (pr_0 with tas_0, pr_1 with tas_1, ...).
for pr_col, tas_col in zip(pr_df.columns, tas_df.columns):
    corr_value = deseasoned_pr[pr_col].corr(deseasoned_tas[tas_col])
    print(corr_value)
# After removing seasonal cycles, the correlations between rainfall and temperature at each location is very low.
0.000950865230372 -0.0011919544031 0.0462636666566 0.0148825906277 0.00181830664459 0.00199986649784 0.006312094835 0.0130738012289 0.0216622688317
# Looks very much like my Agulhas leakage timeseries.
# Full river-flow timeseries.
sns.set(font_scale=2)
sns.set_style(style='white')
fig=plt.figure(figsize=(12,6))
ax = fig.add_subplot(111)
# the size of A4 paper
flow_df.plot(ax=ax)
plt.ylim([0,90])
sns.despine()
plt.show()
# Remove seasonal cycle then estimate correlation
# Monthly climatology of river flow.
climatology_fl=flow_df.groupby(flow_df.index.month).mean()
climatology_fl.plot() ## seasonal cycle
# Highest river flow events in May.
<matplotlib.axes._subplots.AxesSubplot at 0x13a4b9ac8>
# Multitaper power spectral density of the flow series (third-party mtspec).
from mtspec.multitaper import mtspec
# NOTE(review): positional args look like (data, delta=1 day, time_bandwidth=4)
# per the mtspec API -- confirm; returns (spectrum, frequencies).
PSD,f=mtspec(flow_df, 1, 4, number_of_tapers=7)
sns.set_style('whitegrid')
plt.semilogy(f,PSD)
plt.xlim([0,0.02]) # 0.5
plt.xlabel('Frequency(cycles/day)')
Text(0.5,0,'Frequency(cycles/day)')
# Baseline: predict daily river flow from the 18 tas/pr predictors with OLS.
# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# 70/30 train/test split, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(noflow, df['flow'], test_size=0.3, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
# R^2 on the held-out split (fixed stray quote in the original message).
print('Linear Regression R squared: %.4f' % regressor.score(X_test, y_test))
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Same split as the baseline (redundant, but keeps this cell self-contained).
X_train, X_test, y_train, y_test = train_test_split(noflow, df['flow'], test_size=0.3, random_state=0)
# Standardize the features, then fit OLS.
steps = [('scaler', StandardScaler()),
         ('reg', LinearRegression())]
# Create the pipeline: pipeline
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
# Show scores: 5-fold cross-validated R^2 on the training split.
from sklearn.model_selection import cross_val_score
cv_results=cross_val_score(pipeline,X_train, y_train,cv=5)
print(cv_results)
[ 0.16873677 0.16866896 0.15773643 0.17143275 0.16186534]
# Regularization Ridge Regression
from sklearn.linear_model import Ridge

# FIX: Ridge's `normalize` argument was deprecated in sklearn 0.24 and removed
# in 1.2; it is also redundant here because the StandardScaler step already
# standardizes the features before the regressor sees them.
steps2 = [('scaler', StandardScaler()),
          ('Ridge', Ridge(alpha=0.1))]
# Create the pipeline: pipeline
pipeline2 = Pipeline(steps2)
pipeline2.fit(X_train, y_train)
# Show scores: 5-fold cross-validated R^2 on the training split.
cv_results=cross_val_score(pipeline2,X_train, y_train,cv=5)
print(cv_results)
[ 0.15012499 0.15650191 0.14435971 0.16003823 0.14207848]
# Regularization Lasso Regression
from sklearn.linear_model import Lasso
# Step name 'Lasso' is referenced again later by GridSearchCV as 'Lasso__alpha'.
steps3 = [('scaler', StandardScaler()),
          ('Lasso', Lasso(alpha=0.1))]
# Create the pipeline: pipeline
pipeline3 = Pipeline(steps3)
pipeline3.fit(X_train, y_train)
# Show scores: 5-fold cross-validated R^2 on the training split.
cv_results=cross_val_score(pipeline3,X_train, y_train,cv=5)
print(cv_results)
# Most important features: inspect the fitted Lasso coefficients
# (Lasso shrinks uninformative features to exactly zero).
Lasso_coef=pipeline3.named_steps['Lasso'].coef_
names=noflow.columns
_=plt.plot(range(len(names)),Lasso_coef)
_=plt.xticks(range(len(names)),names,rotation=60)
_=plt.ylabel('coefficients')
plt.show()
# Seems tas8, pr3, pr7, pr8 most representative for flow prediction.
# Seems tas8, pr3, pr7, pr8 most representative for flow prediction.
[ 0.15015452 0.15669041 0.14482133 0.16032624 0.14184065]
# Remove Seasonal cycle from the full dataframe (tas, pr and flow together),
# then refit the Lasso pipeline on the anomalies.
anomaly = lambda x: (x - x.mean())
deseasoned_df=df.groupby(df.index.month).transform(anomaly)
deseasoned_noflow=deseasoned_df.drop(['flow'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(deseasoned_noflow, deseasoned_df['flow'], test_size=0.3, random_state=0)
pipeline3.fit(X_train, y_train)
# Show scores: 5-fold cross-validated R^2 on the training split.
from sklearn.model_selection import cross_val_score
cv_results=cross_val_score(pipeline3,X_train, y_train,cv=5)
print(cv_results)
# For all three Linear Regression, the R^2 score reduces significantly after removing seasonal cycle
[ 0.07377393 0.07656659 0.06354876 0.08314756 0.07317459]
# Use GridSearchCV to tune hyperparameters
from sklearn.model_selection import GridSearchCV
# Sweep the Lasso regularization strength alpha over [0.1, 2] (20 values).
param_grid={'Lasso__alpha':np.linspace(0.1,2,20)}
lasso_cv=GridSearchCV(pipeline3,param_grid,cv=5)
lasso_cv.fit(X_train,y_train)
# Best mean cross-validated R^2 over the grid.
lasso_cv.best_score_
0.15076723082521856
# Random-forest regression of flow on the raw (seasonal) predictors.
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(noflow, df['flow'], test_size=0.3, random_state=0)
#X_train, X_test, y_train, y_test = train_test_split(deseasoned_noflow, deseasoned_df['flow'], test_size=0.3, random_state=0)
# Scaling is a no-op for tree ensembles but kept for pipeline consistency.
steps = [('scaler', StandardScaler()),
         ('forest', RandomForestRegressor(random_state=42))]
# Create the pipeline: pipeline
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
# FIX: removed the stray quote that the original printed ('R squared":').
print('Random Forest R squared: %.4f' % pipeline.score(X_test, y_test))
Random Forest R squared": 0.2296
# Gradient-boosted regression of flow on the raw (seasonal) predictors.
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingRegressor

X_train, X_test, y_train, y_test = train_test_split(noflow, df['flow'], test_size=0.3, random_state=0)
#X_train, X_test, y_train, y_test = train_test_split(deseasoned_noflow, deseasoned_df['flow'], test_size=0.3, random_state=0)
# Scaling is a no-op for tree ensembles but kept for pipeline consistency.
steps = [('scaler', StandardScaler()),
         ('gb', GradientBoostingRegressor())]
# Create the pipeline: pipeline
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
# FIX: removed the stray quote that the original printed ('R squared":').
print('Gradient Boost R squared: %.4f' % pipeline.score(X_test, y_test))
Gradient Boost R squared": 0.3144
# Full flow timeseries -- strongly skewed, with episodic high-flow events.
df['flow'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x159dc8860>
# Flow distribution including upper-tail percentiles
# (used below to pick the flood threshold of 20, near the 90th percentile).
df['flow'].describe(percentiles=[.25, .5, .75,.90,.95])
count 21178.000000 mean 6.914146 std 9.665353 min 0.000031 25% 1.156123 50% 3.151342 75% 8.606681 90% 18.567199 95% 26.635724 max 85.397653 Name: flow, dtype: float64
# Labeling for flow > 20: binary flood target (~90th percentile of flow).
df['flood'] = np.where(df['flow']>=20, 1,0)
# Predictor matrix: drop the target and the raw flow it was derived from.
# (sic: the 'df_labeld' name is kept -- later cells reference it.)
df_labeld=df.drop(['flow','flood'], axis=1)
from sklearn.neighbors import KNeighborsClassifier
# FIX: confusion_matrix / classification_report were used below but only
# imported further down the file -- NameError on a fresh top-to-bottom run.
from sklearn.metrics import confusion_matrix, classification_report

X_train, X_test, y_train, y_test = train_test_split(df_labeld, df['flood'], test_size=0.3, random_state=21)
knn=KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train,y_train)
y_pred=knn.predict(X_test)
y_prob=knn.predict_proba(X_test)
# Accuracy plus per-class diagnostics (classes are heavily imbalanced).
print(knn.score(X_test,y_test))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
0.907617248977
[[5721 79]
[ 508 46]]
precision recall f1-score support
0 0.92 0.99 0.95 5800
1 0.37 0.08 0.14 554
avg / total 0.87 0.91 0.88 6354
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
X_train, X_test, y_train, y_test = train_test_split(df_labeld, df['flood'], test_size=0.3, random_state=21)
# Create the classifier: logreg
logreg = LogisticRegression()
# Fit the classifier to the training data
logreg.fit(X_train,y_train)
# Predict the labels of the test set: y_pred
y_pred = logreg.predict(X_test)
# Compute and print the confusion matrix and classification report
# NOTE(review): the classes are heavily imbalanced (~9% positives per the
# report below), so accuracy alone is a misleading metric here.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[5787 13]
[ 541 13]]
precision recall f1-score support
0 0.91 1.00 0.95 5800
1 0.50 0.02 0.04 554
avg / total 0.88 0.91 0.88 6354
from sklearn.metrics import roc_curve
# Compute predicted probabilities for the positive ('flood') class: y_pred_prob
y_pred_prob = logreg.predict_proba(X_test)[:,1]
# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Plot ROC curve (dashed diagonal = no-skill baseline)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Create the hyperparameter grid: regularization strength C and penalty type.
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}
# Instantiate the logistic regression classifier: logreg
# FIX: the default solver (lbfgs since sklearn 0.22) does not support the
# 'l1' penalty and would error on half the grid; liblinear supports both.
logreg = LogisticRegression(solver='liblinear')
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(df_labeld, df['flood'], test_size=0.3, random_state=21)
# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg,param_grid,cv=5)
# Fit it to the training data
logreg_cv.fit(X_train,y_train)
# Print the optimal parameters and best mean cross-validated accuracy
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))
Tuned Logistic Regression Parameter: {'C': 0.0061054022965853268, 'penalty': 'l1'}
Tuned Logistic Regression Accuracy: 0.9106853750674582